Author: Bram Van de Sande
Date: 6 AUG 2019
Outline: SCENIC workflow for selected scRNAseq experiment on human cancer biopsies.
| Accession ID | Cancer type |
|---|---|
| GSE115978 | SKCM |
| GSE103322 | HNSC |
| E-MTAB-6149/E-MTAB-6653 | LUSC, LUAD |
Cancer type acronyms are taken from TCGA: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tcga-study-abbreviations
This notebook can be run in a miniconda environment which can be installed and configured as a Jupyter kernel via the following script: https://github.com/aertslab/pySCENIC/blob/master/miniconda_environment_installation.sh
import os, glob, re, pickle
from functools import partial
from collections import OrderedDict
import operator as op
from cytoolz import compose
import pandas as pd
import seaborn as sns
import numpy as np
import scanpy as sc
import anndata as ad
import matplotlib as mpl
import matplotlib.pyplot as plt
from pyscenic.export import export2loom, add_scenic_metadata
from pyscenic.utils import load_motifs
from pyscenic.transform import df2regulons
from pyscenic.aucell import aucell
from pyscenic.binarization import binarize
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_binarization, plot_rss
from IPython.display import HTML, display
# Set maximum number of jobs for Scanpy.
sc.settings.njobs = 32
Folder structure.
RESOURCES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/resources/"
AUXILLIARIES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/auxilliaries/"
RESULTS_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/results/"
FIGURES_FOLDERNAME = "/Users/bramvandesande/Projects/lcb/protocol/figures/"
sc.settings.figdir = FIGURES_FOLDERNAME
Auxilliary functions.
BASE_URL = "http://motifcollections.aertslab.org/v9/logos/"
COLUMN_NAME_LOGO = "MotifLogo"
COLUMN_NAME_MOTIF_ID = "MotifID"
COLUMN_NAME_TARGETS = "TargetGenes"
def savesvg(fname: str, fig, folder: str=FIGURES_FOLDERNAME) -> None:
"""
Save figure as vector-based SVG image format.
"""
fig.tight_layout()
fig.savefig(os.path.join(folder, fname), format='svg')
def display_logos(df: pd.DataFrame, top_target_genes: int = 3, base_url: str = BASE_URL):
"""
:param df:
:param base_url:
"""
# Make sure the original dataframe is not altered.
df = df.copy()
# Add column with URLs to sequence logo.
def create_url(motif_id):
return '<img src="{}{}.png" style="max-height:124px;"></img>'.format(base_url, motif_id)
df[("Enrichment", COLUMN_NAME_LOGO)] = list(map(create_url, df.index.get_level_values(COLUMN_NAME_MOTIF_ID)))
# Truncate TargetGenes.
def truncate(col_val):
return sorted(col_val, key=op.itemgetter(1))[:top_target_genes]
df[("Enrichment", COLUMN_NAME_TARGETS)] = list(map(truncate, df[("Enrichment", COLUMN_NAME_TARGETS)]))
MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', -1)
display(HTML(df.head().to_html(escape=False)))
pd.set_option('display.max_colwidth', MAX_COL_WIDTH)
Auxilliary data sets.
# Downloaded fromm pySCENIC github repo: https://github.com/aertslab/pySCENIC/tree/master/resources
HUMAN_TFS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'lambert2018.txt')
# Ranking databases. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
RANKING_DBS_FNAMES = list(map(lambda fn: os.path.join(AUXILLIARIES_FOLDERNAME, fn),
['hg19-500bp-upstream-10species.mc9nr.feather',
'hg19-tss-centered-5kb-10species.mc9nr.feather',
'hg19-tss-centered-10kb-10species.mc9nr.feather']))
# Motif annotations. Downloaded from cisTargetDB: https://resources.aertslab.org/cistarget/
MOTIF_ANNOTATIONS_FNAME = os.path.join(AUXILLIARIES_FOLDERNAME, 'motifs-v9-nr.hgnc-m0.001-o0.0.tbl')
Publication: Jerby-Arnon L et al. A Cancer Cell Program Promotes T Cell Exclusion and Resistance to Checkpoint Blockade. Cell 2018 https://dx.doi.org/10.1016/j.cell.2018.09.006
Data set acquisition: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115978
Remarks:
| Code | Description |
|---|---|
| OR | Objective Response |
| ICI | Immune Checkpoint Inhibitor |
DATASET_ID = "GSE115978"
TCGA_CODE = 'SKCM'
Resources downloaded.
# Downloaded from GEO on 28 FEB 2019.
CELL_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME, "GSE115978_cell.annotations.csv")
# Downloaded from Cell Journal website on 1 MAR 2019.
SAMPLE_METADATA_FNAME = os.path.join(RESOURCES_FOLDERNAME, "1-s2.0-S0092867418311784-mmc1.xlsx")
# Downloaded from GEO on 1 MAR 2019.
EXP_MTX_TPM_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'GSE115978_tpm.csv')
EXP_MTX_COUNTS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'GSE115978_counts.csv')
Results created.
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.tpm.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))
METADATA CLEANING
df_annotations = pd.read_csv(CELL_ANNOTATIONS_FNAME)
df_annotations['samples'] = df_annotations['samples'].str.upper()
df_annotations.rename(columns={'cell.types': 'cell_type', 'cells': 'cell_id', 'samples': 'sample_id',
'treatment.group': 'treatment_group', 'Cohort': 'cohort'}, inplace=True)
df_annotations['cell_type'] = df_annotations.cell_type.replace({
'Mal': 'Malignant Cell',
'Endo.': 'Endothelial Cell',
'T.CD4': 'Thelper Cell',
'CAF': 'Fibroblast',
'T.CD8': 'Tcytotoxic Cell',
'T.cell': 'T Cell',
'NK': 'NK Cell',
'B.cell': 'B Cell'})
df_samples = pd.read_excel(SAMPLE_METADATA_FNAME, header=2)
df_samples = df_samples.drop(['Cohort'], axis=1)
df_samples['Sample'] = df_samples.Sample.str.upper()
df_metadata = pd.merge(df_annotations, df_samples, left_on='sample_id', right_on='Sample')
df_metadata = df_metadata.drop(['Sample', 'treatment_group'], axis=1)
df_metadata.rename(columns={'Patient': 'patient_id',
'Age': 'age', 'Sex': 'sex', 'Lesion type': 'lesion_type', 'Site': 'site',
'Treatment': 'treatment', 'Treatment group': 'treatment_group'}, inplace=True)
df_metadata.to_csv(METADATA_FNAME, index=False)
df_metadata.head()
EXPRESSION MATRIX QC
df_tpm = pd.read_csv(EXP_MTX_TPM_FNAME, index_col=0)
df_tpm.shape
#df_counts = pd.read_csv(EXP_MTX_COUNTS_FNAME, index_col=0)
#df_counts.shape
adata = sc.AnnData(X=df_tpm.T.sort_index())
df_obs = df_metadata[['cell_id', 'sample_id', 'cell_type', 'cohort', 'patient_id', 'age', 'sex', 'treatment',
'treatment_group', 'lesion_type', 'site']].set_index('cell_id').sort_index()
adata.obs = df_obs
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
# In the scanpy's tutorials this is used to stored all genes in log-transformed counts before retaining only Highly Variable Genes (HVG).
# Because in this case no filtering is done we use this feature to store raw counts.
adata.raw = adata
sc.pp.log1p(adata)
adata
adata.write_h5ad(ANNDATA_FNAME) # Categorical dtypes are created.
adata.to_df().to_csv(EXP_MTX_QC_FNAME)
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.
2019-04-25 11:22:20,360 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-25 11:23:37,612 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
4 partitions
computing dask graph
not shutting down client, client was created externally
finished
2019-04-26 03:58:01,096 - pyscenic.cli.pyscenic - INFO - Writing results to file.
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.
DBS_PARAM = ' '.join(RANKING_DBS_FNAMES)
2019-04-26 10:23:50,395 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-04-26 10:23:56,229 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-26 10:29:09,101 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-04-26 10:29:09,101 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-04-26 11:11:10,784 - pyscenic.cli.pyscenic - INFO - Writing results to file
The results is a list of enriched motifs for the modules.
| Column name | Description |
|---|---|
| TF | Transcription Factor (TF) for which an enriched motif is discovered. |
| motifID | The identifier of the enriched motif. |
| AUC | Area Under the recovery Curve statistic for this enriched motif. |
| NES | Normalized Enrichment Score for this enriched motif. |
| Context | Collection of tags clarifying the origin of the module for this factor: e.g. ranking database, ... |
| Annotation | Verbose description of the annotation available for this motif. |
| MotifSimilarityQvalue | The TomTom derived Q-value for motif similarity (if used for assigning the factor to this enriched motif). |
| OrthologousIdentity | The Amino Acid Identity between factors (if used for assigning the factor to this enriched motif). |
| RankAtMax | The position of the Leading Edge which is used as a threshold on the whole genome ranking of the motif to decide if a gene in the input is a direct target of a TF that binds this motif. |
| TargetGenes | A list of pairs: genes and their associated weights from GENIE3/GRNBoost2. |
df_motifs = load_motifs(MOTIFS_FNAME)
df_motifs.head()
Display the enriched motifs with their associated sequence logos.
display_logos(df_motifs.head())
REGULON CREATION
Regulons can easily be created from this list of enriched motifs via pyscenic.transform.df2regulons. Here we provide an auxilliary function to carefully select the enriched motifs that contribute to the regulons.
def derive_regulons(motifs, db_names=('hg19-tss-centered-10kb-10species',
'hg19-500bp-upstream-10species',
'hg19-tss-centered-5kb-10species')):
motifs.columns = motifs.columns.droplevel(0)
def contains(*elems):
def f(context):
return any(elem in context for elem in elems)
return f
# For the creation of regulons we only keep the 10-species databases and the activating modules. We also remove the
# enriched motifs for the modules that were created using the method 'weight>50.0%' (because these modules are not part
# of the default settings of modules_from_adjacencies anymore.
motifs = motifs[
np.fromiter(map(compose(op.not_, contains('weight>50.0%')), motifs.Context), dtype=np.bool) & \
np.fromiter(map(contains(*db_names), motifs.Context), dtype=np.bool) & \
np.fromiter(map(contains('activating'), motifs.Context), dtype=np.bool)]
# We build regulons only using enriched motifs with a NES of 3.0 or higher; we take only directly annotated TFs or TF annotated
# for an orthologous gene into account; and we only keep regulons with at least 10 genes.
regulons = list(filter(lambda r: len(r) >= 10, df2regulons(motifs[(motifs['NES'] >= 3.0)
& ((motifs['Annotation'] == 'gene is directly annotated')
| (motifs['Annotation'].str.startswith('gene is orthologous to')
& motifs['Annotation'].str.endswith('which is directly annotated for motif')))
])))
# Rename regulons, i.e. remove suffix.
return list(map(lambda r: r.rename(r.transcription_factor), regulons))
regulons = derive_regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
pickle.dump(regulons, f)
AUCELL
CPU times: user 23.1 s, sys: 10.7 s, total: 33.8 s
Wall time: 39.2 s
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)
100%|██████████| 370/370 [33:21<00:00, 3.39s/it]
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 4), dpi=100)
plot_binarization(auc_mtx, 'NFKB2', thresholds['NFKB2'], ax=ax1)
plot_binarization(auc_mtx, 'MITF', thresholds['MITF'], ax=ax2)
plot_binarization(auc_mtx, 'FOXP3', thresholds['FOXP3'], ax=ax3)
plot_binarization(auc_mtx, 'PAX5', thresholds['PAX5'], ax=ax4)
plot_binarization(auc_mtx, 'IRF8', thresholds['IRF8'], ax=ax5)
plot_binarization(auc_mtx, 'IRF3', thresholds['IRF3'], ax=ax6)
plot_binarization(auc_mtx, 'MLX', thresholds['MLX'], ax=ax7)
plot_binarization(auc_mtx, 'YY1', thresholds['YY1'], ax=ax8)
plt.tight_layout()
savesvg('hists - GSE115978 - binarization.svg', fig)
Create heatmap with binarized regulon activity.
def palplot(pal, names, colors=None, size=1):
n = len(pal)
f, ax = plt.subplots(1, 1, figsize=(n * size, size))
ax.imshow(np.arange(n).reshape(1, n),
cmap=mpl.colors.ListedColormap(list(pal)),
interpolation="nearest", aspect="auto")
ax.set_xticks(np.arange(n) - .5)
ax.set_yticks([-.5, .5])
ax.set_xticklabels([])
ax.set_yticklabels([])
colors = n * ['k'] if colors is None else colors
for idx, (name, color) in enumerate(zip(names, colors)):
ax.text(0.0+idx, 0.0, name, color=color, horizontalalignment='center', verticalalignment='center')
return f
N_COLORS = len(adata.obs.cell_type.dtype.categories)
COLORS = [color['color'] for color in mpl.rcParams["axes.prop_cycle"]]
cell_type_color_lut = dict(zip(adata.obs.cell_type.dtype.categories, COLORS))
#cell_type_color_lut = dict(zip(adata.obs.cell_type.dtype.categories, adata.uns['cell_type_colors']))
cell_id2cell_type_lut = df_metadata.set_index('cell_id').cell_type.to_dict()
bw_palette = sns.xkcd_palette(["white", "black"])
sns.set()
sns.set_style("whitegrid")
fig = palplot(bw_palette, ['OFF', 'ON'], ['k', 'w'])
savesvg('legend - GSE115978 - on_off.svg', fig)
sns.set()
sns.set(font_scale=0.8)
fig = palplot(sns.color_palette(COLORS), adata.obs.cell_type.dtype.categories, size=1.0)
savesvg('legend - GSE115978 - cell_type_colors.svg', fig)
sns.set()
sns.set(font_scale=1.0)
sns.set_style("ticks", {"xtick.minor.size": 1, "ytick.minor.size": 0.1})
g = sns.clustermap(bin_mtx.T,
col_colors=auc_mtx.index.map(cell_id2cell_type_lut).map(cell_type_color_lut),
cmap=bw_palette, figsize=(20,20))
g.ax_heatmap.set_xticklabels([])
g.ax_heatmap.set_xticks([])
g.ax_heatmap.set_xlabel('Cells')
g.ax_heatmap.set_ylabel('Regulons')
g.ax_col_colors.set_yticks([0.5])
g.ax_col_colors.set_yticklabels(['Cell Type'])
g.cax.set_visible(False)
g.fig.savefig(os.path.join(FIGURES_FOLDERNAME, 'clustermap - GSE115978.png'), format='png')
Save clustered binarized heatmap to Excel for further inspection.
bin_mtx_clustered = bin_mtx.T.copy()
bin_mtx_clustered.rename(columns=df_annotations.set_index('cell_id')['cell_type'].to_dict(), inplace=True)
Generate sequence logos.
def fetch_logo(regulon, base_url = BASE_URL):
for elem in regulon.context:
if elem.endswith('.png'):
return '<img src="{}{}" style="max-height:124px;"></img>'.format(base_url, elem)
return ""
df_regulons = pd.DataFrame(data=[list(map(op.attrgetter('name'), regulons)),
list(map(len, regulons)),
list(map(fetch_logo, regulons))], index=['name', 'count', 'logo']).T
MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', -1)
display(HTML(df_regulons.head().to_html(escape=False)))
pd.set_option('display.max_colwidth', MAX_COL_WIDTH)
In this step a scanpy.AnnData object is created containing all metadata and results.
First, we select and retain only the most variable genes in the dataset.
PCA + tSNE PROJECTION
sns.set()
sc.pp.highly_variable_genes(adata)
sc.pl.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]
Then we apply a linear-dimensional reduction technique (PCA).
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color='CD8A')
sc.pl.pca_variance_ratio(adata, log=True)
Followed by a tSNE projection.
sc.tl.tsne(adata)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'],
title=['GSE115978 - SKCM - Cell types', 'GSE115978 - SKCM - Lesion types',
'GSE115978 - SKCM - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3, palette=COLORS,
save=' - GSE115978 - PCA+tSNE.svg')
We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
We add all metadata derived from SCENIC to the scanpy.AnnData object.
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)
AUCELL + tSNE PROJECTION
We change the tSNE projection so that it relies on AUCell instead of PCA.
sc.tl.tsne(adata, use_rep='X_aucell')
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'],
title=['GSE115978 - SKCM - Cell types', 'GSE115978 - SKCM - Lesion types',
'GSE115978 - SKCM - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3, palette=COLORS,
save=' - GSE115978 - AUCell+tSNE.svg')
CELL TYPE SPECIFIC REGULATORS - Z-SCORE
To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 3.0].sort_values('Z', ascending=False),
index='cell_type', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (becuase of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - GSE115978 - regulons.svg', fig)
CELL TYPE SPECIFIC REGULATORS - RSS
rss = regulon_specificity_scores(auc_mtx, adata.obs.cell_type)
rss.head()
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 6), dpi=100)
plot_rss(rss, 'B Cell', ax=ax1)
ax1.set_xlabel('')
plot_rss(rss, 'T Cell', ax=ax2)
ax2.set_xlabel('')
ax2.set_ylabel('')
plot_rss(rss, 'Thelper Cell', ax=ax3)
ax3.set_xlabel('')
ax3.set_ylabel('')
plot_rss(rss, 'Tcytotoxic Cell', ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('')
plot_rss(rss, 'NK Cell', ax=ax5)
plot_rss(rss, 'Fibroblast', ax=ax6)
ax6.set_ylabel('')
plot_rss(rss, 'Macrophage', ax=ax7)
ax7.set_ylabel('')
plot_rss(rss, 'Endothelial Cell', ax=ax8)
ax8.set_ylabel('')
plt.tight_layout()
savesvg('plots - GSE115978 - rss.svg', fig)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'PAX5', 'TCF7', 'EOMES', 'TBX21', 'PRRX2', 'MAFB'],
title=['GSE115978 - SKCM - Cell types', 'PAX5', 'TCF7', 'EOMES', 'TBX21', 'PRRX2', 'MAFB'], ncols=4, use_raw=False,
save=' - GSE115978 - rss_gene.svg', palette=COLORS, cmap='magma')
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(PAX5)', 'Regulon(TCF7)', 'Regulon(EOMES)', 'Regulon(TBX21)', 'Regulon(PRRX2)', 'Regulon(MAFB)', 'Regulon(SOX7)'],
title=['GSE115978 - SKCM - Cell types', 'Regulon(PAX5)', 'Regulon(TCF7)', 'Regulon(EOMES)', 'Regulon(TBX21)', 'Regulon(PRRX2)', 'Regulon(MAFB)', 'Regulon(SOX7)'], ncols=4, use_raw=False,
save=' - GSE115978 - rss_regulon.svg', palette=COLORS, cmap='viridis')
SELECTED REGULONS
With the metadata from SCENIC add to the AnnData object, we can create cellular scatterplots for regulon activity. In the example we compare MITF regulon enrichment with the expression of the MITF gene.
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MITF)', 'Regulon(TWIST1)'],
title=['GSE115978 - SKCM - Cell types', 'Regulon(MITF)', 'Regulon(TWIST1)'], ncols=3, use_raw=False,
save=' - GSE115978 - selected_regulons1.svg', palette=COLORS, cmap='viridis')
sc.pl.tsne(adata, color=['cell_type', 'MITF'],
title=['GSE115978 - SKCM - Cell types', 'MITF'], ncols=3, use_raw=False,
save=' - GSE115978 - selected_regulons2.svg', palette=COLORS, cmap='magma')
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'MYC'],
title=['GSE115978 - SKCM - Cell types', 'MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
save=' - GSE115978 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MYC)'],
title=['GSE115978 - SKCM - Cell types', 'MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
save=' - GSE115978 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
AUCELL DISTRIBUTIONS
We can also analyze the distribution of AUCell values via the plots in the scanpy package.
df_results.sort_values('Z', ascending=False).groupby(by='cell_type').head(2)
aucell_adata = sc.AnnData(X=auc_mtx.sort_index())
aucell_adata.obs = df_obs
names = list(map(op.attrgetter('name'), filter(lambda r: r.score > 8.0, regulons)))
sc.pl.stacked_violin(aucell_adata, names, groupby='cell_type',
save=' - GSE115978 - regulons.svg')
export2loom(adata.to_df(), regulons, LOOM_FNAME,
cell_annotations=adata.obs['cell_type'].to_dict(),
embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
auc_mtx = auc_mtx,
tree_structure=(),
title='{}_{}'.format(TCGA_CODE, DATASET_ID),
nomenclature="HGNC", auc_thresholds=thresholds,
compress=True)
Publication: Puram S et al. Single-Cell Transcriptomic Analysis of Primary and Metastatic Tumor Ecosystems in Head and Neck Cancer. Cell 2017 https://dx.doi.org/10.1016/j.cell.2017.10.044
Data set acquisition: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE103322
DATASET_ID = "GSE103322"
TCGA_CODE = 'HNSC'
Resources downloaded.
# Downloaded from GEO on 15 JUN 2019.
ALL_FNAME = os.path.join(RESOURCES_FOLDERNAME, "GSE103322_HNSCC_all_data.txt")
Results created.
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.tpm.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.tpm.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
CLEANING
def process_gse103322(fname):
# Load CSV file
mtx = pd.read_csv(fname, sep='\t', index_col=0, skiprows=[1,2,3,4,5])
# Extract gene symbol
mtx.index = list(map(lambda g: g[1:-1], mtx.index))
# Remove duplicate gene symbols (keep last when sorted ascending order according to row sum)
mtx = mtx.iloc[mtx.sum(axis=1).argsort()]
mtx = mtx[~mtx.index.duplicated(keep='last')]
return mtx
df_mtx = process_gse103322(ALL_FNAME)
df_mtx.to_csv(EXP_MTX_FNAME, index=True)
cell_annotations = pd.read_csv(ALL_FNAME, sep='\t', index_col=0, low_memory=False).head(5)
cell_type = pd.Series(data=list(map(lambda t: t[0] if t[1] else 'Malignant Cell',
zip(cell_annotations.T['non-cancer cell type'],
cell_annotations.T['classified as non-cancer cells'].map(float).map(bool)))),
index = cell_annotations.columns, name='cell_type').replace({'-Fibroblast': 'Fibroblast',
'myocyte': 'Myocyte',
'Endothelial': 'Endothelial Cell',
'Dendritic': 'Dendritic Cell',
'Mast': 'Mast Cell',
'T cell': 'T Cell',
'B cell': 'B Cell'})
lesion_type = pd.Series(data=list(map(lambda t: 'Lymph node' if t else 'Primary tumor',
cell_annotations.T['Lymph node'].map(float).map(bool))),
index = cell_annotations.columns, name='lesion_type')
def clean_pid(s: str) -> str:
m = re.match("HN(SCC)?_?(\d{1,2})_", s)
if m:
return 'MEEI{}'.format(int(m.group(2)))
else:
return 'MEEI9'
patient_id = pd.Series(data=cell_annotations.columns.map(clean_pid), index = cell_annotations.columns, name='patient_id')
df_metadata = pd.concat([cell_type, lesion_type, patient_id], axis=1).reset_index().rename(columns={'index': 'cell_id'})
df_metadata.to_csv(METADATA_FNAME, sep=',')
df_metadata.head()
EXPRESSION MATRIX QC
adata = sc.AnnData(X=df_mtx.T.sort_index())
df_obs = df_metadata.set_index('cell_id').sort_index()
adata.obs = df_obs
adata.var_names_make_unique()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.raw = adata #Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
sc.pp.log1p(adata)
adata
adata.to_df().to_csv(EXP_MTX_QC_FNAME)
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.
2019-04-25 09:07:25,605 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-25 09:08:25,870 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
32 partitions
computing dask graph
not shutting down client, client was created externally
finished
2019-04-25 10:55:14,802 - pyscenic.cli.pyscenic - INFO - Writing results to file.
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.
2019-04-26 09:44:55,314 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-04-26 09:45:00,335 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-04-26 09:49:37,344 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-04-26 09:49:37,345 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-04-26 10:22:54,812 - pyscenic.cli.pyscenic - INFO - Writing results to file.
df_motifs = load_motifs(MOTIFS_FNAME)
REGULON CREATION
regulons = derive_regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
pickle.dump(regulons, f)
AUCELL
`
CPU times: user 18.2 s, sys: 8.92 s, total: 27.2 s
Wall time: 30.7 s
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)
In this step a scanpy.AnnData object is created containing all metadata and results.
First, we select and retain only the most variable genes in the dataset.
sc.pp.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]
Then we apply a linear-dimensional reduction technique (PCA).
sc.tl.pca(adata, svd_solver='arpack')
Followed by a tSNE projection.
sc.tl.tsne(adata)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type','patient_id'],
title=['GSE103322 - HNSC - Cell types', 'GSE103322 - HNSC - Lesion types',
'GSE103322 - HNSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
save=' - GSE103322 - PCA+tSNE.svg')
We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
We add all metadata derived from SCENIC to the scanpy.AnnData object.
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)
We change the tSNE projection so that it relies on AUCell instead of PCA.
sc.tl.tsne(adata, use_rep='X_aucell')
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type','patient_id'],
title=['GSE103322 - HNSC - Cell types', 'GSE103322 - HNSC - Lesion types',
'GSE103322 - HNSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
save=' - GSE103322 - AUCell+tSNE.svg')
CELL TYPE SPECIFIC REGULATORS - RSS
rss = regulon_specificity_scores(auc_mtx, adata.obs.cell_type)
rss.head()
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4), (ax5, ax6, ax7, ax8)) = plt.subplots(2, 4, figsize=(8, 6), dpi=100)
plot_rss(rss, 'B Cell', ax=ax1)
ax1.set_xlabel('')
plot_rss(rss, 'T Cell', ax=ax2)
ax2.set_xlabel('')
ax2.set_ylabel('')
plot_rss(rss, 'Mast Cell', ax=ax3)
ax3.set_xlabel('')
ax3.set_ylabel('')
plot_rss(rss, 'Dendritic Cell', ax=ax4)
ax4.set_xlabel('')
ax4.set_ylabel('')
plot_rss(rss, 'Myocyte', ax=ax5)
plot_rss(rss, 'Fibroblast', ax=ax6)
ax6.set_ylabel('')
plot_rss(rss, 'Macrophage', ax=ax7)
ax7.set_ylabel('')
plot_rss(rss, 'Endothelial Cell', ax=ax8)
ax8.set_ylabel('')
plt.tight_layout()
savesvg('plots - GSE103322 - rss.svg', fig)
CELL TYPE SPECIFIC REGULATORS - Z-SCORE
To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 3.0].sort_values('Z', ascending=False),
index='cell_type', columns='regulon', values='Z')
#df_heatmap.drop(index='Myocyte', inplace=True) # We leave out Myocyte because many TFs are highly enriched (becuase of small number of cells).
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - GSE103322 - regulons.svg', fig)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(NFE2)', 'Regulon(XBP1)'],
title=['GSE103322 - HNSC - Cell types', 'NFE2', 'XBP1'], ncols=3, use_raw=False,
save=' - GSE103322 - regulons.svg')
SELECTED REGULONS
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'MYC'],
title=['GSE103322 - HNSC - Cell types', 'MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
save=' - GSE103322 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(MYC)'],
title=['GSE103322 - HNSC - Cell types', 'MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
save=' - GSE103322 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
DISCOVERY OF MORE FINE-GRAINED CELL TYPES AND THEIR REGULATORS
In this notebook we'll subcluster the fibroblasts and find regulators that define these subtypes of fibroblasts. More specifically, we create a nearest neighbour graph on the AUCell-based dimensional reduced cell space. Louvain community detection is used on the resulting graph to clusters cells.
sc.pp.neighbors(adata, use_rep='X_aucell', n_neighbors=3)
sc.tl.louvain(adata)
counter = 1
def init(data, col_name: str = 'cellular_phenotype'):
data.obs[col_name] = data.obs['cell_type'].astype(str)
return data
def subcluster(cell_type: str, abbr: str, data, min_cells: int = 20, col_name: str = 'cellular_phenotype'):
global counter
counter = 1
ct_idx = data.obs[data.obs.cell_type == cell_type].index
df_cell_numbers = data[ct_idx, :].obs.louvain.value_counts()
clusters = df_cell_numbers[df_cell_numbers >= min_cells].index
def rename(n: str) -> str:
global counter
if n in clusters:
res = '{}{}'.format(abbr, counter)
counter += 1
else:
res = '?'
return res
data.obs.loc[ct_idx, [col_name]] = data[ct_idx, :].obs['louvain'].map(rename).values
return data
adata = init(adata)
adata = subcluster('Fibroblast', 'F', adata)
adata.write_h5ad(ANNDATA_FNAME)
adata.obs.cellular_phenotype.dtype.categories
N_COLORS = len(adata.obs.cellular_phenotype.dtype.categories)
COLORS = [color['color'] for color in mpl.rcParams["axes.prop_cycle"]]
sc.pl.tsne(adata, color=['cellular_phenotype'],
title=['GSE103322 - HNSC - Cell types'], ncols=3, use_raw=False, palette=sns.color_palette("Set3"), legend_loc='on data')
We use RSS to identify regulators for these different types of fibroblasts.
rss = regulon_specificity_scores(auc_mtx, adata.obs.cellular_phenotype)
rss.head()
sns.set()
sns.set(style='whitegrid', font_scale=0.8)
fig, ((ax1, ax2, ax3, ax4)) = plt.subplots(1, 4, figsize=(8, 3), dpi=100)
plot_rss(rss, 'F1', ax=ax1)
plot_rss(rss, 'F2', ax=ax2)
ax2.set_ylabel('')
plot_rss(rss, 'F3', ax=ax3)
ax3.set_ylabel('')
plot_rss(rss, 'F4', ax=ax4)
ax3.set_ylabel('')
savesvg('plots - GSE103322 - rss - fibroblasts.svg', fig)
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME)
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
export2loom(adata.to_df(), regulons, LOOM_FNAME,
cell_annotations=adata.obs['cell_type'].to_dict(),
embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
auc_mtx = auc_mtx,
tree_structure=(),
title='{}_{}'.format(TCGA_CODE, DATASET_ID),
nomenclature="HGNC", auc_thresholds=thresholds,
compress=True)
Publication: Lambrechts D et al. Phenotype molding of stromal cells in the lung tumor microenvironment. Nature Medicine 2018
https://dx.doi.org/10.1038/s41591-018-0096-5
Data set acquisition:
DATASET_ID = "E-MTAB-6149_6653"
TCGA_CODE = 'LUAD_LUSC'
Resources downloaded.
# Directly provided by the author.
# The input matrix was the normalized expression matrix, output from Seurat, from which 9,919 genes passed the filtering
# (sum of expression > 3 × 0.005 × 52,698 and detected in at least 0.5% of the cells).
ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'b_thienpont.clusters.txt')
EXP_MTX_FNAME = os.path.join(RESOURCES_FOLDERNAME, 'E-MTAB-6149_6653.expr.tsv')
Results created.
METADATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.metadata.csv'.format(DATASET_ID))
EXP_MTX_QC_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.qc.expr.csv'.format(DATASET_ID))
ADJACENCIES_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.adjacencies.tsv'.format(DATASET_ID))
MOTIFS_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.motifs.csv'.format(DATASET_ID))
REGULONS_DAT_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.regulons.dat'.format(DATASET_ID))
AUCELL_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.auc.csv'.format(DATASET_ID))
ANNDATA_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.h5ad'.format(DATASET_ID))
LOOM_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}_{}.loom'.format(TCGA_CODE, DATASET_ID))
BIN_MTX_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.bin.csv'.format(DATASET_ID))
THR_FNAME = os.path.join(RESULTS_FOLDERNAME, '{}.thresholds.csv'.format(DATASET_ID))
METADATA CLEANING
annotations = pd.read_csv(ANNOTATIONS_FNAME, sep=' ', index_col=0)
annotations['lesion_type'] = ['LC' if row['CellFromTumor'] else 'NL' for _, row in annotations.iterrows()]
annotations = annotations[['cell', 'CellType', 'Sample', 'Patientnumber', 'lesion_type']].rename(columns={
'cell': 'cell_id',
'Sample': 'sample_id',
'Patientnumber': 'patient_id',
'CellType': 'cell_type'}).set_index('cell_id')
annotations['cell_type'] = annotations.cell_type.replace({'T_cell': 'T cell',
'B_cell': 'B cell',
'Fibro': 'Fibroblast',
'tumor': 'Malignant cell',
'Epi': 'Epithelial cell',
'EC': 'Endothelial cell',
'Alveolar': 'Alveolar cell',
'Myeloid': 'Meyloid cell'})
annotations['patient_id'] = annotations['patient_id'].map(str)
annotations['sample_id'] = annotations['sample_id'].map(str)
annotations.to_csv(METADATA_FNAME, sep=',')
annotations.head()
EXPRESSION MATRIX QC
df_mtx = pd.read_csv(EXP_MTX_FNAME, sep='\t', index_col=0)
adata = sc.AnnData(X=df_mtx.T.sort_index())
adata.obs = annotations.sort_index()
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata.var_names_make_unique()
adata.raw = adata #Store non-log transformed data as raw. This data can be used via the use_raw parameters available for many functions.
sc.pp.log1p(adata)
adata
adata.to_df().to_csv(EXP_MTX_QC_FNAME)
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in ADJACENCIES_FNAME.
2019-05-02 21:24:43,654 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-05-02 21:27:06,425 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
8 partitions
computing dask graph
finished
2019-05-03 17:38:49,059 - pyscenic.cli.pyscenic - INFO - Writing results to file.
For this step the CLI version of SCENIC is used. This step can be deployed on an High Performance Computing system.
Output: List of adjacencies between a TF and its targets stored in MOTIFS_FNAME.
2019-05-06 20:50:20,880 - pyscenic.cli.pyscenic - INFO - Creating modules.
2019-05-06 20:50:23,528 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.
2019-05-06 20:55:30,410 - pyscenic.cli.pyscenic - INFO - Loading databases.
2019-05-06 20:55:30,410 - pyscenic.cli.pyscenic - INFO - Calculating regulons.
2019-05-06 21:31:19,109 - pyscenic.cli.pyscenic - INFO - Writing results to file.
df_motifs = load_motifs(MOTIFS_FNAME)
REGULON CREATION
regulons = derive_regulons(df_motifs)
# Pickle these regulons.
with open(REGULONS_DAT_FNAME, 'wb') as f:
pickle.dump(regulons, f)
AUCELL
%%time auc_mtx = aucell(exp_mtx, regulons, num_workers=18) auc_mtx.to_csv(AUCELL_MTX_FNAME)
CPU times: user 54.2 s, sys: 31.2 s, total: 1min 25s
Wall time: 1min 43s
auc_mtx = pd.read_csv(AUCELL_MTX_FNAME, index_col=0)
In this step a scanpy.AnnData object is created containing all metadata and results.
First, we select and retain only the most variable genes in the dataset.
sc.pp.highly_variable_genes(adata)
adata = adata[:, adata.var['highly_variable']]
Then we apply a linear-dimensional reduction technique (PCA).
sc.tl.pca(adata, svd_solver='arpack')
Followed by a tSNE projection.
sc.tl.tsne(adata)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'],
title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types',
'E-MTAB-6149/6653 - LUAD/LUSC - Lesion types',
'E-MTAB-6149/6653 - LUAD/LUSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
save=' - E-MTAB-6149_6653 - PCA+tSNE.svg')
We capture the non-linear projection based on PCA+tSNE for later storage in the loom file.
embedding_pca_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
We add all metadata derived from SCENIC to the scanpy.AnnData object.
add_scenic_metadata(adata, auc_mtx, regulons)
adata.write_h5ad(ANNDATA_FNAME)
We change the tSNE projection so that it relies on AUCell instead of PCA.
sc.tl.tsne(adata, use_rep='X_aucell')
embedding_aucell_tsne = pd.DataFrame(adata.obsm['X_tsne'], columns=[['_X', '_Y']], index=adata.obs_names)
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['cell_type', 'lesion_type', 'patient_id'],
title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types',
'E-MTAB-6149/6653 - LUAD/LUSC - Lesion types',
'E-MTAB-6149/6653 - LUAD/LUSC - {} patients'.format(len(adata.obs.patient_id.unique()))], ncols=3,
save=' - E-MTAB-6149_6653 - AUCell+tSNE.svg')
Display AUCell scores on top of cellular scatterplot for selected regulons.
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['MYC'],
title=['MYC - Gene'], ncols=3, palette=COLORS, use_raw=False,
save=' - E-MTAB-6149_6653 - AUCell+tSNE - Gene MYC.svg', cmap='magma')
sc.set_figure_params(frameon=False, dpi=150, fontsize=8)
sc.pl.tsne(adata, color=['Regulon(MYC)'],
title=['MYC - Regulon'], ncols=3, palette=COLORS, use_raw=False,
save=' - E-MTAB-6149_6653 - AUCell+tSNE - Regulon MYC.svg', cmap='viridis')
To find cell type specific regulators we use a Z score (i.e. the average AUCell score for the cells of a give type are standardized using the overall average AUCell scores and its standard deviation).
df_obs = adata.obs
signature_column_names = list(df_obs.select_dtypes('number').columns)
signature_column_names = list(filter(lambda s: s.startswith('Regulon('), signature_column_names))
df_scores = df_obs[signature_column_names + ['cell_type']]
df_results = ((df_scores.groupby(by='cell_type').mean() - df_obs[signature_column_names].mean())/ df_obs[signature_column_names].std()).stack().reset_index().rename(columns={'level_1': 'regulon', 0:'Z'})
df_results['regulon'] = list(map(lambda s: s[8:-1], df_results.regulon))
df_results[(df_results.Z >= 3.0)].sort_values('Z', ascending=False).head()
df_heatmap = pd.pivot_table(data=df_results[df_results.Z >= 2.5].sort_values('Z', ascending=False),
index='cell_type', columns='regulon', values='Z')
fig, ax1 = plt.subplots(1, 1, figsize=(10, 8))
sns.heatmap(df_heatmap, ax=ax1, annot=True, fmt=".1f", linewidths=.7, cbar=False, square=True, linecolor='gray',
cmap="YlGnBu", annot_kws={"size": 6})
ax1.set_ylabel('')
savesvg('heatmap - E-MTAB-6149_6653 - regulons.svg', fig)
sc.pl.tsne(adata, color=['cell_type', 'Regulon(RFX3)', 'Regulon(FOXO1)'],
title=['E-MTAB-6149/6653 - LUAD/LUSC - Cell types', 'RFX3', 'FOXO1'], ncols=3, use_raw=False,
save=' - E-MTAB-6149_6653 - regulons.svg')
bin_mtx, thresholds = binarize(auc_mtx)
bin_mtx.to_csv(BIN_MTX_FNAME)
thresholds.to_frame().rename(columns={0:'threshold'}).to_csv(THR_FNAME)
bin_mtx = pd.read_csv(BIN_MTX_FNAME, index_col=0)
thresholds = pd.read_csv(THR_FNAME, index_col=0).threshold
export2loom(adata.to_df(), regulons, LOOM_FNAME,
cell_annotations=adata.obs['cell_type'].to_dict(),
embeddings=OrderedDict([('AUCell + tSNE', embedding_aucell_tsne), ('PCA + tSNE', embedding_pca_tsne)]),
auc_mtx = auc_mtx,
tree_structure=(),
title='{}_{}'.format(TCGA_CODE, DATASET_ID),
nomenclature="HGNC", auc_thresholds=thresholds,
compress=True)